Design a TM2 PepSeq library for our second cohort of patients for the Tumor Infiltrating Lymphocytes (TIL) Project. Exome sequencing and initial sequence analysis was performed at TGen HQ. A collection of files has been provided by Kevin Drenner in the Sharma lab. Each file corresponds to a unique patient, prefaced by ‘C038,’ or a unique mouse, ‘mm10.’ This analysis will gather the data set and perform some pre-processing steps prior to using an external oligo_encoding tool written by Zane Fink in the Ladner lab at NAU to select the best peptides for the final PepSeq library.
Start with importing the necessary libraries.
library(tidyverse)
library(MHCbindR)
library(DT)
# Render a data frame as an interactive DataTable with export buttons
# (copy / csv / excel / pdf / print) and a page-length menu that includes
# an "All" option. Used throughout this report to display tables.
create_dt <- function(x) {
  table_opts <- list(
    dom = "Blfrtip",
    buttons = c("copy", "csv", "excel", "pdf", "print"),
    lengthMenu = list(
      c(10, 25, 50, -1),
      c(10, 25, 50, "All")
    )
  )
  DT::datatable(x, extensions = "Buttons", options = table_opts)
}
Import tumor mutations data generated by Kevin Drenner at TGen HQ.
Create a single data frame of all mutations, data. Count the number of rows in each file; this is a rough indicator of the number of variants in a file. Plot number of rows per file.
# Read every per-sample variant file and row-bind them into a single data
# frame, tagging each row with its source file name for later per-file counts.
# BUG FIX: dir() interprets `pattern` as a regular expression, not a glob;
# "*.csv" is a glob and only matched by accident. "\\.csv$" correctly matches
# file names ending in ".csv".
mutation_dir <- "data_from_kevin/TILPepseq2_Updated/TILPepseq2_mutations/"
files <- dir(mutation_dir, pattern = "\\.csv$")
data <- files %>%
  map_dfr(function(f) {
    read_csv(file.path(mutation_dir, f)) %>%
      mutate(file = f)
  })
glimpse(data)
## Rows: 5,211
## Columns: 36
## $ variantId <chr> "chr1 g.109492471A>T", "chr1 g.110022034G>A", "chr1 g.1100…
## $ mutPept1 <chr> "NHKLDSLTYKIDECE", "ENCWFVFKETPWHGQ", "ENCWFVFKETPWHGQ", "…
## $ mutPept2 <chr> "CNHKLDSLTYKIDEC", "AENCWFVFKETPWHG", "AENCWFVFKETPWHG", "…
## $ mutPept3 <chr> "ECNHKLDSLTYKIDE", "WAENCWFVFKETPWH", "WAENCWFVFKETPWH", "…
## $ mutPept4 <chr> "SECNHKLDSLTYKID", "LWAENCWFVFKETPW", "LWAENCWFVFKETPW", "…
## $ mutPept5 <chr> "ISECNHKLDSLTYKI", "FLWAENCWFVFKETP", "FLWAENCWFVFKETP", "…
## $ mutPept6 <chr> "EISECNHKLDSLTYK", "FFLWAENCWFVFKET", "FFLWAENCWFVFKET", "…
## $ mutPept7 <chr> "DEISECNHKLDSLTY", "NFFLWAENCWFVFKE", "NFFLWAENCWFVFKE", "…
## $ mutPept8 <chr> "ADEISECNHKLDSLT", "INFFLWAENCWFVFK", "INFFLWAENCWFVFK", "…
## $ mutPept9 <chr> "CADEISECNHKLDSL", "FINFFLWAENCWFVF", "FINFFLWAENCWFVF", "…
## $ mutPept10 <chr> "SCADEISECNHKLDS", "GFINFFLWAENCWFV", "GFINFFLWAENCWFV", "…
## $ mutPept11 <chr> "LSCADEISECNHKLD", "FGFINFFLWAENCWF", "FGFINFFLWAENCWF", "…
## $ mutPept12 <chr> "DLSCADEISECNHKL", "LFGFINFFLWAENCW", "LFGFINFFLWAENCW", "…
## $ mutPept13 <chr> "PDLSCADEISECNHK", "VLFGFINFFLWAENC", "VLFGFINFFLWAENC", "…
## $ mutPept14 <chr> "SPDLSCADEISECNH", "LVLFGFINFFLWAEN", "SVLFGFINFFLWAEN", "…
## $ mutPept15 <chr> "VSPDLSCADEISECN", "PLVLFGFINFFLWAE", "ISVLFGFINFFLWAE", "…
## $ effectId <chr> "ENST00000302500_p.Y68N|ENST00000348264_p.Y68N|ENST0000035…
## $ gene_name <chr> "CLCC1", "SYPL2", "SYPL2", "PROK1", "PIFO", "PTPN22", "NRA…
## $ topEffect <lgl> TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRU…
## $ wtPept1 <chr> "YHKLDSLTYKIDECE", "GNCWFVFKETPWHGQ", "GNCWFVFKETPWHGQ", "…
## $ wtPept2 <chr> "CYHKLDSLTYKIDEC", "AGNCWFVFKETPWHG", "AGNCWFVFKETPWHG", "…
## $ wtPept3 <chr> "ECYHKLDSLTYKIDE", "WAGNCWFVFKETPWH", "WAGNCWFVFKETPWH", "…
## $ wtPept4 <chr> "SECYHKLDSLTYKID", "LWAGNCWFVFKETPW", "LWAGNCWFVFKETPW", "…
## $ wtPept5 <chr> "ISECYHKLDSLTYKI", "FLWAGNCWFVFKETP", "FLWAGNCWFVFKETP", "…
## $ wtPept6 <chr> "EISECYHKLDSLTYK", "FFLWAGNCWFVFKET", "FFLWAGNCWFVFKET", "…
## $ wtPept7 <chr> "DEISECYHKLDSLTY", "NFFLWAGNCWFVFKE", "NFFLWAGNCWFVFKE", "…
## $ wtPept8 <chr> "ADEISECYHKLDSLT", "INFFLWAGNCWFVFK", "INFFLWAGNCWFVFK", "…
## $ wtPept9 <chr> "CADEISECYHKLDSL", "FINFFLWAGNCWFVF", "FINFFLWAGNCWFVF", "…
## $ wtPept10 <chr> "SCADEISECYHKLDS", "GFINFFLWAGNCWFV", "GFINFFLWAGNCWFV", "…
## $ wtPept11 <chr> "LSCADEISECYHKLD", "FGFINFFLWAGNCWF", "FGFINFFLWAGNCWF", "…
## $ wtPept12 <chr> "DLSCADEISECYHKL", "LFGFINFFLWAGNCW", "LFGFINFFLWAGNCW", "…
## $ wtPept13 <chr> "PDLSCADEISECYHK", "VLFGFINFFLWAGNC", "VLFGFINFFLWAGNC", "…
## $ wtPept14 <chr> "SPDLSCADEISECYH", "LVLFGFINFFLWAGN", "SVLFGFINFFLWAGN", "…
## $ wtPept15 <chr> "VSPDLSCADEISECY", "PLVLFGFINFFLWAG", "ISVLFGFINFFLWAG", "…
## $ sourceName <chr> "C038_0022_009388_PB_Whole_C1_K1ID2_A50599-C038_0022_00938…
## $ file <chr> "C038_0022_merged_Ashion_Research_Peptide_100PercentIdenti…
table(data$file)
##
## C038_0022_merged_Ashion_Research_Peptide_100PercentIdentityRemoved.varCode.csv
## 915
## C038_0031_merged_Ashion_Research_Peptide.varCode.csv
## 97
## C038_0034_merged_Ashion_Research_Peptide_100PercentIdentityRemoved.varCode.csv
## 120
## C038_0036_merged_Ashion_Research_Peptide_100PercentIdentityRemoved.varCode.csv
## 105
## C038_0038_merged_Ashion_Research_Peptide.varCode.csv
## 68
## C038_0039_merged_Ashion_Research_Peptide_100PercentIdentityRemoved.varCode.csv
## 49
## C038_0044_merged_Ashion_Research_Peptide.varCode.csv
## 87
## C038_0045_merged_Ashion_Research_Peptide_100PercentIdentityRemoved.varCode.csv
## 280
## C038_0046_merged_Ashion_Research_Peptide_100PercentIdentityRemoved.varCode.csv
## 219
## Phiser_mm10_B16F10_Final.varCode.csv
## 1042
## Phiser_mm10_MC38_Phiser_Final.varCode.csv
## 2229
# Bar chart of variant counts (rows) per source file.
# FIXES:
# 1. geom_bar() counts rows per category directly; the original
#    geom_histogram(stat = "count") produced the "Ignoring unknown
#    parameters: binwidth, bins, pad" warning.
# 2. str_replace() treats its pattern as a regex, so the literal file
#    suffix is wrapped in fixed() to escape the dots.
# 3. theme_bw() is a complete theme and must come BEFORE theme(); the
#    original order silently discarded the axis-text rotation.
p1 <- data %>%
  mutate(file = str_replace(file, fixed(".varCode.csv"), "")) %>%
  ggplot(aes(file)) +
  geom_bar() +
  coord_flip() +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
p1
Select mutations 7, 8, 9 for wildtype and mutant sequences. Gather into a long data frame. Drop any rows with NA for sequence. Filter for peptides with topEffect == TRUE and 15mers.
# Reshape the three central mutant and wildtype peptide windows (7-9) into
# long format: one row per (variant, peptide window) with its sequence.
# Rows with no sequence (NA) are dropped.
# MODERNIZED: pivot_longer() replaces the superseded gather(). Note the row
# order differs from gather() (per-row interleaving instead of column
# stacking), which only affects which duplicate's metadata distinct() keeps
# downstream; the set of rows is identical.
peptides_long <- data %>%
  pivot_longer(
    cols = c(mutPept7, mutPept8, mutPept9, wtPept7, wtPept8, wtPept9),
    names_to = "peptide", values_to = "sequence"
  ) %>%
  select(variantId, effectId, gene_name, topEffect, sourceName, file, peptide, sequence) %>%
  filter(!is.na(sequence)) %>%
  relocate(effectId, .after = sequence)
create_dt(peptides_long)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
We only want to include variants with topEffect and length=15. Summarizing the non-topEffect and non-15mers here to show what has been excluded from the library design.
# Peptides excluded from the design because the variant is not flagged as
# the top effect.
not_topEffect <- peptides_long %>%
  filter(!topEffect)
create_dt(not_topEffect)
# Peptides excluded from the design because they are not full 15-mers.
not_15mers <- peptides_long %>%
  filter(str_length(sequence) != 15)
create_dt(not_15mers)
After filtering for topEffect == TRUE, there are some duplicated sequences remaining. Some duplicates are across multiple patients, whereas others are across the same patient, just from another sample/sequence library.
# Among topEffect peptides, flag sequences seen more than once (within or
# across patients/samples) and keep only those duplicated rows. The dup
# column is retained so downstream readers can see the flag.
# (peptides_long is ungrouped, so no preliminary ungroup() is needed.)
pep_dups <- peptides_long %>%
  filter(topEffect) %>%
  group_by(sequence) %>%
  mutate(dup = n() > 1) %>%
  filter(dup)
create_dt(pep_dups)
Need to just include a single peptide sequence for the duplicates in our peptide pool for library design. annotated_peptides won’t include information from duplicated sequences, so if we need the metadata for duplicates, will have to pull from the pep_dups data frame.
# Annotated candidate pool: topEffect 15-mers only; duplicate sequences are
# still present here (deduplication happens in unique_peptides).
annotated_peptides <- peptides_long %>%
  filter(topEffect, str_length(sequence) == 15)
Keep only rows with distinct peptide sequences. distinct() keeps only the first row if there are replicated sequences; .keep_all = TRUE retains the other columns.
# Deduplicated candidate pool: topEffect 15-mers with one row per unique
# sequence. distinct() keeps the first occurrence; .keep_all = TRUE retains
# that row's metadata columns.
unique_peptides <- peptides_long %>%
  filter(topEffect, str_length(sequence) == 15) %>%
  distinct(sequence, .keep_all = TRUE)
Create a data frame with named unique peptides to serve as input for the PepSeq Library Design tool; this data frame is called named_peptides. Export named_peptides.csv to use as the import for the oligo_encoding tool.
# Build the two-column design input for the oligo_encoding tool:
# id (e.g. "TM2_00001") then sequence, one row per unique peptide.
# FIXES: str_c() is vectorized, so the rep() wrapper was redundant, and
# row_number() supplies the running index directly. The intermediate
# tibble(for_library_design) call double-wrapped the selection (in current
# tibble versions that creates a packed data-frame column, breaking the
# subsequent $sequence access), so the construction is collapsed into a
# single pipeline.
named_peptides <- unique_peptides %>%
  select(sequence) %>%
  mutate(id = str_c("TM2_", sprintf("%05d", row_number()))) %>%
  select(id, sequence)
create_dt(named_peptides)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
# NOTE(review): these exports were run once and then commented out. Before
# re-running with readr >= 1.4, rename `path =` to `file =` (path is
# deprecated); `quote = FALSE` is no longer accepted either (readr >= 2.0
# uses quote = "none") -- TODO confirm the installed readr version.
# write_csv(named_peptides, path = "design_outs/named_peptides.csv", col_names = FALSE, quote = FALSE)
# write_csv(annotated_peptides, path = "design_outs/annotated_peptides.csv")
oligo_encoding script to choose the best peptides for the PepSeq library. The oligo_encoding tool requires a specific version of h2o, along with some other dependencies (details on the Ladner lab GitHub). I am using the oligo_encoding script from the Library-Design tool (pulled from git sha: 416bee3458855e76dc73d10a87515144abbee799).

cat analysis/bash/conda_pepseq_design.yml
## name: pepseq_design
## channels:
## - defaults
## - conda-forge
## - bioconda
## - r
## dependencies:
## - ca-certificates=2019.1.23=0
## - certifi=2019.3.9=py37_0
## - libedit=3.1.20181209=hc058e9b_0
## - libffi=3.2.1=hd88cf55_4
## - libgcc-ng=8.2.0=hdf63c60_1
## - libstdcxx-ng=8.2.0=hdf63c60_1
## - llvm-openmp=8.0.0=hc9558a2_0
## - ncurses=6.1=he6710b0_1
## - openmp=8.0.0=0
## - openssl=1.1.1b=h7b6447c_1
## - pip=19.0.3=py37_0
## - python=3.7.3=h0371630_0
## - readline=7.0=h7b6447c_5
## - setuptools=41.0.0=py37_0
## - sqlite=3.28.0=h7b6447c_0
## - tk=8.6.8=hbc83047_0
## - wheel=0.33.1=py37_0
## - xz=5.2.4=h14c3975_4
## - zlib=1.2.11=h7b6447c_3
## - pip:
## - chardet==3.0.4
## - colorama==0.4.1
## - future==0.17.1
## - h2o==3.20.0.8
## - idna==2.8
## - numpy==1.16.3
## - pandas==0.24.2
## - python-dateutil==2.8.0
## - pytz==2019.1
## - requests==2.21.0
## - six==1.12.0
## - tabulate==0.8.3
## - urllib3==1.24.2
## prefix: /home/ekelley/bin/anaconda3/envs/pepseq_design
First step of the oligo_encoding analysis, run via the following slurm script.

cat analysis/bash/run_pepseq_design_step1.sh
## #!/usr/bin/bash
##
## #SBATCH --time=96:00:00
## #SBATCH --mem=100G
## #SBATCH --partition=hmem
## #SBATCH -c 2
## #SBATCH --mail-type=ALL
## #SBATCH--job-name=pepseq1
## #SBATCH--mail-user=ekelley@tgen.org
## #SBATCH --nice=10
##
## /labs/Immunology/ekelley/Library-Design/scripts/oligo_encoding/main \
## -r /scratch/ekelley/TM2_PepSeq_Library_Design/output_ratio \
## -s /scratch/ekelley/TM2_PepSeq_Library_Design/out_seqs \
## -n 300 \
## -c 2 \
## -p /labs/Immunology/ekelley/Library-Design/scripts/oligo_encoding/codon_weights.csv \
## -i /scratch/ekelley/TM2_PepSeq_Library_Design/named_peptides.csv \
## -t 10000 \
## -g 0.55 \
##
Second step of the oligo_encoding analysis. -n 3 will give the three best encodings per peptide. This was performed on our HPC cluster using the following slurm script; note that you must activate the conda environment prior to running the slurm script. This analysis took slightly more than an hour using 2 cores with 40G memory.

cat analysis/bash/run_pepseq_design_step2.sh
## #!/usr/bin/bash
##
## #SBATCH --time=96:00:00
## #SBATCH --mem=40G
## #SBATCH -c 2
## #SBATCH --mail-type=ALL
## #SBATCH--job-name=neuralnet
## #SBATCH--mail-user=ekelley@tgen.org
##
## #source activate pepseq_design
##
## /labs/Immunology/ekelley/Library-Design/scripts/oligo_encoding/encoding_with_nn.py \
## -m /labs/Immunology/ekelley/Library-Design/scripts/oligo_encoding/DeepLearning_model_R_1539970074840_1_20181019 \
## -r /scratch/ekelley/TM2_PepSeq_Library_Design/output_ratio \
## -s /scratch/ekelley/TM2_PepSeq_Library_Design/out_seqs \
## -o best_encodings \
## --subsample 300 \
## --read_per_loop 10 \
## -n 3 \
##
best_encodings output from oligo_encoding:

best_encodings <- read_csv("analysis/bash/best_encodings")
create_dt(best_encodings)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html